library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
library(knitr)
library(formattable)
library(randomForest)
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
library(leaps)
library(splines)
library(caret)
## Loading required package: lattice
library(glmnet)
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
## Loaded glmnet 4.1-8
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ readr 2.1.5
## ✔ lubridate 1.9.3 ✔ stringr 1.5.1
## ✔ purrr 1.0.2 ✔ tibble 3.2.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ randomForest::combine() masks dplyr::combine()
## ✖ Matrix::expand() masks tidyr::expand()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ purrr::lift() masks caret::lift()
## ✖ randomForest::margin() masks ggplot2::margin()
## ✖ Matrix::pack() masks tidyr::pack()
## ✖ Matrix::unpack() masks tidyr::unpack()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(cluster)
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
taxi_df <- read.csv("../taxi_Trips__2024.csv")
head(taxi_df)
{r}tlsg print(names(taxi_df)) print(nrow(taxi_df))
colSums(is.na(taxi_df))
## Trip.ID Taxi.ID
## 0 1
## Trip.Start.Timestamp Trip.End.Timestamp
## 0 0
## Trip.Seconds Trip.Miles
## 87 5
## Pickup.Census.Tract Dropoff.Census.Tract
## 266886 273775
## Pickup.Community.Area Dropoff.Community.Area
## 11105 42428
## Fare Tips
## 1014 1014
## Tolls Extras
## 1014 1014
## Trip.Total Payment.Type
## 1014 0
## Company Pickup.Centroid.Latitude
## 0 10989
## Pickup.Centroid.Longitude Pickup.Centroid.Location
## 10989 0
## Dropoff.Centroid.Latitude Dropoff.Centroid.Longitude
## 39982 39982
## Dropoff.Centroid..Location
## 0
(unique(taxi_df$Company))
## [1] "Flash Cab"
## [2] "Taxicab Insurance Agency Llc"
## [3] "Globe Taxi"
## [4] "5 Star Taxi"
## [5] "City Service"
## [6] "Chicago Independents"
## [7] "Blue Ribbon Taxi Association"
## [8] "Taxi Affiliation Services"
## [9] "Chicago City Taxi Association"
## [10] "Choice Taxi Association"
## [11] "Medallion Leasin"
## [12] "Sun Taxi"
## [13] "U Taxicab"
## [14] "Taxicab Insurance Agency, LLC"
## [15] "Choice Taxi Association Inc"
## [16] "Chicago Taxicab"
## [17] "Patriot Taxi Dba Peace Taxi Associat"
## [18] "Setare Inc"
## [19] "Taxi Affiliation Services Llc - Yell"
## [20] "3556 - 36214 RC Andrews Cab"
## [21] "Top Cab"
## [22] "Koam Taxi Association"
## [23] "312 Medallion Management Corp"
## [24] "Star North Taxi Management Llc"
## [25] "6574 - Babylon Express Inc."
## [26] "5167 - 71969 5167 Taxi Inc"
## [27] "2733 - 74600 Benny Jona"
## [28] "3591 - 63480 Chuks Cab"
## [29] "Tac - Yellow Cab Association"
## [30] "Metro Jet Taxi A."
## [31] "4787 - 56058 Reny Cab Co"
## [32] "4623 - 27290 Jay Kim"
## [33] "4053 - 40193 Adwar H. Nikola"
## [34] "Petani Cab Corp"
## [35] "Tac - Checker Cab Dispatch"
feature_desc <- read.csv("taxi_Trips_2024_Feature_descriptions.csv")
feature_desc
notes <- list()
for (feature in names(taxi_df)) {
curr_note <- paste("Valid rows:",(nrow(taxi_df) - sum(is.na(taxi_df[,feature]))),
"; N/A rows:", sum(is.na(taxi_df[,feature])),
"; Unique values:", length(unique(taxi_df[,feature])))
notes <- append(notes, curr_note)
}
notes
## [[1]]
## [1] "Valid rows: 425229 ; N/A rows: 0 ; Unique values: 425229"
##
## [[2]]
## [1] "Valid rows: 425228 ; N/A rows: 1 ; Unique values: 2520"
##
## [[3]]
## [1] "Valid rows: 425229 ; N/A rows: 0 ; Unique values: 2977"
##
## [[4]]
## [1] "Valid rows: 425229 ; N/A rows: 0 ; Unique values: 2988"
##
## [[5]]
## [1] "Valid rows: 425142 ; N/A rows: 87 ; Unique values: 6077"
##
## [[6]]
## [1] "Valid rows: 425224 ; N/A rows: 5 ; Unique values: 4278"
##
## [[7]]
## [1] "Valid rows: 158343 ; N/A rows: 266886 ; Unique values: 221"
##
## [[8]]
## [1] "Valid rows: 151454 ; N/A rows: 273775 ; Unique values: 347"
##
## [[9]]
## [1] "Valid rows: 414124 ; N/A rows: 11105 ; Unique values: 78"
##
## [[10]]
## [1] "Valid rows: 382801 ; N/A rows: 42428 ; Unique values: 78"
##
## [[11]]
## [1] "Valid rows: 424215 ; N/A rows: 1014 ; Unique values: 4642"
##
## [[12]]
## [1] "Valid rows: 424215 ; N/A rows: 1014 ; Unique values: 2161"
##
## [[13]]
## [1] "Valid rows: 424215 ; N/A rows: 1014 ; Unique values: 139"
##
## [[14]]
## [1] "Valid rows: 424215 ; N/A rows: 1014 ; Unique values: 865"
##
## [[15]]
## [1] "Valid rows: 424215 ; N/A rows: 1014 ; Unique values: 7816"
##
## [[16]]
## [1] "Valid rows: 425229 ; N/A rows: 0 ; Unique values: 7"
##
## [[17]]
## [1] "Valid rows: 425229 ; N/A rows: 0 ; Unique values: 35"
##
## [[18]]
## [1] "Valid rows: 414240 ; N/A rows: 10989 ; Unique values: 276"
##
## [[19]]
## [1] "Valid rows: 414240 ; N/A rows: 10989 ; Unique values: 276"
##
## [[20]]
## [1] "Valid rows: 425229 ; N/A rows: 0 ; Unique values: 277"
##
## [[21]]
## [1] "Valid rows: 385247 ; N/A rows: 39982 ; Unique values: 366"
##
## [[22]]
## [1] "Valid rows: 385247 ; N/A rows: 39982 ; Unique values: 365"
##
## [[23]]
## [1] "Valid rows: 425229 ; N/A rows: 0 ; Unique values: 366"
feature_desc$Notes <- unlist(notes)
feature_desc
# Install the formattable package if not already installed
if (!require(formattable)) {
install.packages("formattable")
}
formattable(feature_desc, align = c("l", "l", "l", "l"), list(Notes = formatter("span", style = "color:blue"), Type = formatter("span", style = "color:green"))
)
| Column.Name | Description | Type | Notes |
|---|---|---|---|
| Trip ID | A unique identifier for the trip. | Plain Text | Valid rows: 425229 ; N/A rows: 0 ; Unique values: 425229 |
| Taxi ID | A unique identifier for the taxi. | Plain Text | Valid rows: 425228 ; N/A rows: 1 ; Unique values: 2520 |
| Trip Start Timestamp | When the trip started, rounded to the nearest 15 minutes. | Date & Time | Valid rows: 425229 ; N/A rows: 0 ; Unique values: 2977 |
| Trip End Timestamp | When the trip ended, rounded to the nearest 15 minutes. | Date & Time | Valid rows: 425229 ; N/A rows: 0 ; Unique values: 2988 |
| Trip Seconds | Time of the trip in seconds. | Number | Valid rows: 425142 ; N/A rows: 87 ; Unique values: 6077 |
| Trip Miles | Distance of the trip in miles. | Number | Valid rows: 425224 ; N/A rows: 5 ; Unique values: 4278 |
| Pickup Census Tract | The Census Tract where the trip began. For privacy, this Census Tract is not shown for some trips. This column often will be blank for locations outside Chicago. | Plain Text | Valid rows: 158343 ; N/A rows: 266886 ; Unique values: 221 |
| Dropoff Census Tract | The Census Tract where the trip ended. For privacy, this Census Tract is not shown for some trips. This column often will be blank for locations outside Chicago. | Plain Text | Valid rows: 151454 ; N/A rows: 273775 ; Unique values: 347 |
| Pickup Community Area | The Community Area where the trip began. This column will be blank for locations outside Chicago. | Number | Valid rows: 414124 ; N/A rows: 11105 ; Unique values: 78 |
| Dropoff Community Area | The Community Area where the trip ended. This column will be blank for locations outside Chicago. | Number | Valid rows: 382801 ; N/A rows: 42428 ; Unique values: 78 |
| Fare | The fare for the trip. | Number | Valid rows: 424215 ; N/A rows: 1014 ; Unique values: 4642 |
| Tips | The tip for the trip. Cash tips generally will not be recorded. | Number | Valid rows: 424215 ; N/A rows: 1014 ; Unique values: 2161 |
| Tolls | The tolls for the trip. | Number | Valid rows: 424215 ; N/A rows: 1014 ; Unique values: 139 |
| Extras | Extra charges for the trip. | Number | Valid rows: 424215 ; N/A rows: 1014 ; Unique values: 865 |
| Trip Total | Total cost of the trip, the total of the previous columns. | Number | Valid rows: 424215 ; N/A rows: 1014 ; Unique values: 7816 |
| Payment Type | Type of payment for the trip. | Plain Text | Valid rows: 425229 ; N/A rows: 0 ; Unique values: 7 |
| Company | The taxi company. | Plain Text | Valid rows: 425229 ; N/A rows: 0 ; Unique values: 35 |
| Pickup Centroid Latitude | The latitude of the center of the pickup census tract or the community area if the census tract has been hidden for privacy. This column often will be blank for locations outside Chicago. | Number | Valid rows: 414240 ; N/A rows: 10989 ; Unique values: 276 |
| Pickup Centroid Longitude | The longitude of the center of the pickup census tract or the community area if the census tract has been hidden for privacy. This column often will be blank for locations outside Chicago. | Number | Valid rows: 414240 ; N/A rows: 10989 ; Unique values: 276 |
| Pickup Centroid Location | The location of the center of the pickup census tract or the community area if the census tract has been hidden for privacy. This column often will be blank for locations outside Chicago. | Point | Valid rows: 425229 ; N/A rows: 0 ; Unique values: 277 |
| Dropoff Centroid Latitude | The latitude of the center of the dropoff census tract or the community area if the census tract has been hidden for privacy. This column often will be blank for locations outside Chicago. | Number | Valid rows: 385247 ; N/A rows: 39982 ; Unique values: 366 |
| Dropoff Centroid Longitude | The longitude of the center of the dropoff census tract or the community area if the census tract has been hidden for privacy. This column often will be blank for locations outside Chicago. | Number | Valid rows: 385247 ; N/A rows: 39982 ; Unique values: 365 |
| Dropoff Centroid Location | The location of the center of the dropoff census tract or the community area if the census tract has been hidden for privacy. This column often will be blank for locations outside Chicago. | Point | Valid rows: 425229 ; N/A rows: 0 ; Unique values: 366 |
length(which(taxi_df$Fare + taxi_df$Tips + taxi_df$Tolls + taxi_df$Extras != taxi_df$Trip.Total))
## [1] 136222
features_to_drop <- c("Trip.End.Timestamp", "Pickup.Census.Tract", "Dropoff.Census.Tract", "Pickup.Centroid.Latitude", "Pickup.Centroid.Longitude", "Pickup.Centroid.Location", "Dropoff.Centroid.Latitude", "Dropoff.Centroid.Longitude", "Dropoff.Centroid..Location")
simplified_taxi_df <- subset(taxi_df, select = -c(Trip.End.Timestamp, Pickup.Census.Tract, Dropoff.Census.Tract, Pickup.Centroid.Latitude, Pickup.Centroid.Longitude, Pickup.Centroid.Location, Dropoff.Centroid.Latitude, Dropoff.Centroid.Longitude, Dropoff.Centroid..Location))
head(simplified_taxi_df)
dim(simplified_taxi_df)
## [1] 425229 14
colSums(is.na(simplified_taxi_df))
## Trip.ID Taxi.ID Trip.Start.Timestamp
## 0 1 0
## Trip.Seconds Trip.Miles Pickup.Community.Area
## 87 5 11105
## Dropoff.Community.Area Fare Tips
## 42428 1014 1014
## Tolls Extras Trip.Total
## 1014 1014 1014
## Payment.Type Company
## 0 0
cleaned_taxi_df <- simplified_taxi_df[!apply(is.na(simplified_taxi_df), 1, any), ]
colSums(is.na(cleaned_taxi_df))
## Trip.ID Taxi.ID Trip.Start.Timestamp
## 0 0 0
## Trip.Seconds Trip.Miles Pickup.Community.Area
## 0 0 0
## Dropoff.Community.Area Fare Tips
## 0 0 0
## Tolls Extras Trip.Total
## 0 0 0
## Payment.Type Company
## 0 0
dim(cleaned_taxi_df)
## [1] 379024 14
head(cleaned_taxi_df)
cleaned_taxi_df$Trip.Minutes <- round(cleaned_taxi_df$Trip.Seconds / 60, digits = 2)
cleaned_taxi_df$Trip.Seconds <- NULL
head(cleaned_taxi_df)
#cleaned_taxi_df$Trip.Start.Timestamp <- as.POSIXct(cleaned_taxi_df$Trip.Start.Timestamp, format = "%m/%d/%Y %I:%M:%S %p")
# ---------------------------------------------------------------------------------
# If you are facing any errors with the format used above, try using the one below
# ---------------------------------------------------------------------------------
cleaned_taxi_df$Trip.Start.Timestamp <- as.POSIXct(cleaned_taxi_df$Trip.Start.Timestamp, format = "%m/%d/%y %H:%M")
cleaned_taxi_df$Trip.Start.Date <- as.Date(cleaned_taxi_df$Trip.Start.Timestamp)
cleaned_taxi_df$Trip.Hour.Of.The.Day <- as.integer(format(cleaned_taxi_df$Trip.Start.Timestamp, format = "%H"))
cleaned_taxi_df$Trip.Hour.Of.The.Day <- as.factor(cleaned_taxi_df$Trip.Hour.Of.The.Day)
days_of_week <- c("Sunday" = 1, "Monday" = 2, "Tuesday" = 3, "Wednesday" = 4, "Thursday" = 5, "Friday" = 6, "Saturday" = 7)
cleaned_taxi_df$Trip.Day.Of.The.Week <- as.integer(days_of_week[weekdays(cleaned_taxi_df$Trip.Start.Timestamp)])
cleaned_taxi_df$Trip.Day.Of.The.Week <- as.factor(cleaned_taxi_df$Trip.Day.Of.The.Week)
head(cleaned_taxi_df)
cleaned_taxi_df$Payment.Type <- as.factor(cleaned_taxi_df$Payment.Type)
cleaned_taxi_df$Company <- as.factor(cleaned_taxi_df$Company)
cleaned_taxi_df$Taxi.ID <- as.factor(cleaned_taxi_df$Taxi.ID)
head(cleaned_taxi_df)
cleaned_taxi_df$Pickup.Community.Area <- as.factor(cleaned_taxi_df$Pickup.Community.Area)
cleaned_taxi_df$Dropoff.Community.Area <- as.factor(cleaned_taxi_df$Dropoff.Community.Area)
dim(cleaned_taxi_df)
## [1] 379024 17
names(cleaned_taxi_df)
## [1] "Trip.ID" "Taxi.ID" "Trip.Start.Timestamp"
## [4] "Trip.Miles" "Pickup.Community.Area" "Dropoff.Community.Area"
## [7] "Fare" "Tips" "Tolls"
## [10] "Extras" "Trip.Total" "Payment.Type"
## [13] "Company" "Trip.Minutes" "Trip.Start.Date"
## [16] "Trip.Hour.Of.The.Day" "Trip.Day.Of.The.Week"
summary(cleaned_taxi_df)
## Trip.ID
## Length:379024
## Class :character
## Mode :character
##
##
##
##
## Taxi.ID
## d40dae7ea46d61abca67eb53b157fe9cf0b485cca6dce122604588a69aa6c4b6b78e0e5c5fd11f9702babd94016122df1d328a459c8b7de2cb37a1bad947b1fe: 828
## 2780ead18beaa862cc67315ddabd9d1acaadcd6da82eba38b064d7d6f4acc260b68ef1ae3ce06dad845178107940b3493fa99640f0f70c25d15cf57336ab7b8f: 739
## abd1ffa32433ceabeb49f4461015b38ddc252847ed3a29320aee6af650ba1e927195d191bf191f4f6f329ad7512a3f0f8e43ea844f3ead6f7c50fc4f0ccff08a: 637
## 37073e8c9e454886fe4a916f80a9a3478570e7dd3e663f40c5b81eae90f8f611027c67455f43b426f4c34dcb7fdb6697c82a3c6d00237f11a4a6cf5b1d1ce0c7: 635
## 13016372e777da1289d557edbe4ce2be8a68e77bc64768acaf5e0539b10be2ca089238dc27408b49b17899014e6e178e17c3ba455812fd84024f93e266324439: 633
## 8da9e1d18757022c6a6a614fc2d38483e38aae441feff500095a83ebc68006cf88329f2c28e35ba92ead14037739f9971a8a2852f946ebc59d0160c4f1104ec8: 630
## (Other) :374922
## Trip.Start.Timestamp Trip.Miles Pickup.Community.Area
## Min. :2024-01-01 00:00:00.00 Min. : 0.000 8 :82654
## 1st Qu.:2024-01-09 20:45:00.00 1st Qu.: 0.880 76 :72260
## Median :2024-01-18 10:00:00.00 Median : 2.570 32 :63444
## Mean :2024-01-17 10:42:18.18 Mean : 6.085 28 :42272
## 3rd Qu.:2024-01-24 16:30:00.00 3rd Qu.: 11.270 6 :12880
## Max. :2024-02-01 00:00:00.00 Max. :664.900 56 :12047
## (Other):93467
## Dropoff.Community.Area Fare Tips Tolls
## 8 : 92741 Min. : 0.00 Min. : 0.000 Min. : 0.000
## 32 : 64694 1st Qu.: 7.75 1st Qu.: 0.000 1st Qu.: 0.000
## 28 : 40248 Median : 14.00 Median : 0.040 Median : 0.000
## 6 : 19276 Mean : 20.72 Mean : 2.592 Mean : 0.054
## 76 : 18269 3rd Qu.: 32.25 3rd Qu.: 3.700 3rd Qu.: 0.000
## 7 : 16568 Max. :1525.00 Max. :200.000 Max. :4444.440
## (Other):127228
## Extras Trip.Total Payment.Type
## Min. : 0.000 Min. : 0.00 Cash :108481
## 1st Qu.: 0.000 1st Qu.: 9.75 Credit Card:136043
## Median : 0.000 Median : 16.50 Dispute : 102
## Mean : 1.327 Mean : 24.86 Mobile : 62672
## 3rd Qu.: 1.000 3rd Qu.: 36.30 No Charge : 244
## Max. :5051.100 Max. :8912.13 Prcard : 50556
## Unknown : 20926
## Company Trip.Minutes Trip.Start.Date
## Flash Cab :86779 Min. : 0.00 Min. :2024-01-01
## Taxi Affiliation Services :71541 1st Qu.: 7.28 1st Qu.:2024-01-10
## Sun Taxi :40352 Median : 14.23 Median :2024-01-18
## Taxicab Insurance Agency Llc:39522 Mean : 18.48 Mean :2024-01-17
## City Service :35399 3rd Qu.: 25.72 3rd Qu.:2024-01-24
## Chicago Independents :21779 Max. :1435.58 Max. :2024-02-01
## (Other) :83652
## Trip.Hour.Of.The.Day Trip.Day.Of.The.Week
## 17 : 28075 1:34446
## 16 : 27115 2:59096
## 15 : 26222 3:67076
## 18 : 25558 4:72588
## 14 : 25184 5:59096
## 13 : 24860 6:50350
## (Other):222010 7:36372
head(cleaned_taxi_df)
saveRDS(cleaned_taxi_df, "taxi_df_data_cleaning.rds")
summary(cleaned_taxi_df)
## Trip.ID
## Length:379024
## Class :character
## Mode :character
##
##
##
##
## Taxi.ID
## d40dae7ea46d61abca67eb53b157fe9cf0b485cca6dce122604588a69aa6c4b6b78e0e5c5fd11f9702babd94016122df1d328a459c8b7de2cb37a1bad947b1fe: 828
## 2780ead18beaa862cc67315ddabd9d1acaadcd6da82eba38b064d7d6f4acc260b68ef1ae3ce06dad845178107940b3493fa99640f0f70c25d15cf57336ab7b8f: 739
## abd1ffa32433ceabeb49f4461015b38ddc252847ed3a29320aee6af650ba1e927195d191bf191f4f6f329ad7512a3f0f8e43ea844f3ead6f7c50fc4f0ccff08a: 637
## 37073e8c9e454886fe4a916f80a9a3478570e7dd3e663f40c5b81eae90f8f611027c67455f43b426f4c34dcb7fdb6697c82a3c6d00237f11a4a6cf5b1d1ce0c7: 635
## 13016372e777da1289d557edbe4ce2be8a68e77bc64768acaf5e0539b10be2ca089238dc27408b49b17899014e6e178e17c3ba455812fd84024f93e266324439: 633
## 8da9e1d18757022c6a6a614fc2d38483e38aae441feff500095a83ebc68006cf88329f2c28e35ba92ead14037739f9971a8a2852f946ebc59d0160c4f1104ec8: 630
## (Other) :374922
## Trip.Start.Timestamp Trip.Miles Pickup.Community.Area
## Min. :2024-01-01 00:00:00.00 Min. : 0.000 8 :82654
## 1st Qu.:2024-01-09 20:45:00.00 1st Qu.: 0.880 76 :72260
## Median :2024-01-18 10:00:00.00 Median : 2.570 32 :63444
## Mean :2024-01-17 10:42:18.18 Mean : 6.085 28 :42272
## 3rd Qu.:2024-01-24 16:30:00.00 3rd Qu.: 11.270 6 :12880
## Max. :2024-02-01 00:00:00.00 Max. :664.900 56 :12047
## (Other):93467
## Dropoff.Community.Area Fare Tips Tolls
## 8 : 92741 Min. : 0.00 Min. : 0.000 Min. : 0.000
## 32 : 64694 1st Qu.: 7.75 1st Qu.: 0.000 1st Qu.: 0.000
## 28 : 40248 Median : 14.00 Median : 0.040 Median : 0.000
## 6 : 19276 Mean : 20.72 Mean : 2.592 Mean : 0.054
## 76 : 18269 3rd Qu.: 32.25 3rd Qu.: 3.700 3rd Qu.: 0.000
## 7 : 16568 Max. :1525.00 Max. :200.000 Max. :4444.440
## (Other):127228
## Extras Trip.Total Payment.Type
## Min. : 0.000 Min. : 0.00 Cash :108481
## 1st Qu.: 0.000 1st Qu.: 9.75 Credit Card:136043
## Median : 0.000 Median : 16.50 Dispute : 102
## Mean : 1.327 Mean : 24.86 Mobile : 62672
## 3rd Qu.: 1.000 3rd Qu.: 36.30 No Charge : 244
## Max. :5051.100 Max. :8912.13 Prcard : 50556
## Unknown : 20926
## Company Trip.Minutes Trip.Start.Date
## Flash Cab :86779 Min. : 0.00 Min. :2024-01-01
## Taxi Affiliation Services :71541 1st Qu.: 7.28 1st Qu.:2024-01-10
## Sun Taxi :40352 Median : 14.23 Median :2024-01-18
## Taxicab Insurance Agency Llc:39522 Mean : 18.48 Mean :2024-01-17
## City Service :35399 3rd Qu.: 25.72 3rd Qu.:2024-01-24
## Chicago Independents :21779 Max. :1435.58 Max. :2024-02-01
## (Other) :83652
## Trip.Hour.Of.The.Day Trip.Day.Of.The.Week
## 17 : 28075 1:34446
## 16 : 27115 2:59096
## 15 : 26222 3:67076
## 18 : 25558 4:72588
## 14 : 25184 5:59096
## 13 : 24860 6:50350
## (Other):222010 7:36372
cleaned_taxi_df <- readRDS("taxi_df_data_cleaning.rds")
attach(cleaned_taxi_df)
summary(cleaned_taxi_df)
## Trip.ID
## Length:379024
## Class :character
## Mode :character
##
##
##
##
## Taxi.ID
## d40dae7ea46d61abca67eb53b157fe9cf0b485cca6dce122604588a69aa6c4b6b78e0e5c5fd11f9702babd94016122df1d328a459c8b7de2cb37a1bad947b1fe: 828
## 2780ead18beaa862cc67315ddabd9d1acaadcd6da82eba38b064d7d6f4acc260b68ef1ae3ce06dad845178107940b3493fa99640f0f70c25d15cf57336ab7b8f: 739
## abd1ffa32433ceabeb49f4461015b38ddc252847ed3a29320aee6af650ba1e927195d191bf191f4f6f329ad7512a3f0f8e43ea844f3ead6f7c50fc4f0ccff08a: 637
## 37073e8c9e454886fe4a916f80a9a3478570e7dd3e663f40c5b81eae90f8f611027c67455f43b426f4c34dcb7fdb6697c82a3c6d00237f11a4a6cf5b1d1ce0c7: 635
## 13016372e777da1289d557edbe4ce2be8a68e77bc64768acaf5e0539b10be2ca089238dc27408b49b17899014e6e178e17c3ba455812fd84024f93e266324439: 633
## 8da9e1d18757022c6a6a614fc2d38483e38aae441feff500095a83ebc68006cf88329f2c28e35ba92ead14037739f9971a8a2852f946ebc59d0160c4f1104ec8: 630
## (Other) :374922
## Trip.Start.Timestamp Trip.Miles Pickup.Community.Area
## Min. :2024-01-01 00:00:00.00 Min. : 0.000 8 :82654
## 1st Qu.:2024-01-09 20:45:00.00 1st Qu.: 0.880 76 :72260
## Median :2024-01-18 10:00:00.00 Median : 2.570 32 :63444
## Mean :2024-01-17 10:42:18.18 Mean : 6.085 28 :42272
## 3rd Qu.:2024-01-24 16:30:00.00 3rd Qu.: 11.270 6 :12880
## Max. :2024-02-01 00:00:00.00 Max. :664.900 56 :12047
## (Other):93467
## Dropoff.Community.Area Fare Tips Tolls
## 8 : 92741 Min. : 0.00 Min. : 0.000 Min. : 0.000
## 32 : 64694 1st Qu.: 7.75 1st Qu.: 0.000 1st Qu.: 0.000
## 28 : 40248 Median : 14.00 Median : 0.040 Median : 0.000
## 6 : 19276 Mean : 20.72 Mean : 2.592 Mean : 0.054
## 76 : 18269 3rd Qu.: 32.25 3rd Qu.: 3.700 3rd Qu.: 0.000
## 7 : 16568 Max. :1525.00 Max. :200.000 Max. :4444.440
## (Other):127228
## Extras Trip.Total Payment.Type
## Min. : 0.000 Min. : 0.00 Cash :108481
## 1st Qu.: 0.000 1st Qu.: 9.75 Credit Card:136043
## Median : 0.000 Median : 16.50 Dispute : 102
## Mean : 1.327 Mean : 24.86 Mobile : 62672
## 3rd Qu.: 1.000 3rd Qu.: 36.30 No Charge : 244
## Max. :5051.100 Max. :8912.13 Prcard : 50556
## Unknown : 20926
## Company Trip.Minutes Trip.Start.Date
## Flash Cab :86779 Min. : 0.00 Min. :2024-01-01
## Taxi Affiliation Services :71541 1st Qu.: 7.28 1st Qu.:2024-01-10
## Sun Taxi :40352 Median : 14.23 Median :2024-01-18
## Taxicab Insurance Agency Llc:39522 Mean : 18.48 Mean :2024-01-17
## City Service :35399 3rd Qu.: 25.72 3rd Qu.:2024-01-24
## Chicago Independents :21779 Max. :1435.58 Max. :2024-02-01
## (Other) :83652
## Trip.Hour.Of.The.Day Trip.Day.Of.The.Week
## 17 : 28075 1:34446
## 16 : 27115 2:59096
## 15 : 26222 3:67076
## 18 : 25558 4:72588
## 14 : 25184 5:59096
## 13 : 24860 6:50350
## (Other):222010 7:36372
dim(cleaned_taxi_df)
## [1] 379024 17
hist(Fare, breaks = 50, main = "Histogram of Total fare", xlab = "Total Fare")
summary(Fare)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 7.75 14.00 20.72 32.25 1525.00
Looking at the histogram and the boxplot, there are some extreme values that hinders accuracy of our future model, hence needs to be removed.
fare_related_features <- cleaned_taxi_df[, c('Fare','Tips','Tolls','Extras','Trip.Total')]
boxplot(fare_related_features)
# Define the function to detect outliers for a single column
is.outlier <- function(x) {
iqr <- IQR(x, na.rm = TRUE)
lower <- quantile(x, 0.25, na.rm = TRUE) - 1.5 * iqr
upper <- quantile(x, 0.75, na.rm = TRUE) + 1.5 * iqr
return(x < lower | x > upper)
}
outliers <- is.outlier(Trip.Total)
number of outliers
# Define the function to detect outliers for a single column
is.outlier <- function(x) {
iqr <- IQR(x, na.rm = TRUE)
lower <- quantile(x, 0.25, na.rm = TRUE) - 1.5 * iqr
upper <- quantile(x, 0.75, na.rm = TRUE) + 1.5 * iqr
return(x < lower | x > upper)
}
# List of numerical columns to check for outliers
numerical_cols <- c("Fare", "Tips", "Tolls", "Extras", "Trip.Total", "Trip.Minutes", "Trip.Miles")
# Initialize a logical vector to store the rows to keep (no outliers)
rows_to_keep <- rep(TRUE, nrow(cleaned_taxi_df))
# Initialize a vector to store the count of outliers for each feature
outlier_counts <- numeric(length(numerical_cols))
names(outlier_counts) <- numerical_cols
# Loop through each numerical column
for (col in numerical_cols) {
# Find the outliers in the column
outliers <- is.outlier(cleaned_taxi_df[[col]])
# Store the count of outliers for the feature
outlier_counts[col] <- sum(outliers)
rows_to_keep <- rows_to_keep & !outliers
}
cleaned_taxi_df <- cleaned_taxi_df[rows_to_keep, ]
# Print the number of outliers for each feature
print(outlier_counts)
## Fare Tips Tolls Extras Trip.Total Trip.Minutes
## 2439 34169 534 82339 3889 9271
## Trip.Miles
## 1603
dim(cleaned_taxi_df)
## [1] 285958 17
distance_features <- cleaned_taxi_df[, c('Trip.Miles')]
boxplot(distance_features)
hist(cleaned_taxi_df$Fare, breaks = 25, main = "Histogram of Fare", xlab = "Fare")
summary(cleaned_taxi_df$Fare)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 6.75 10.00 14.83 21.50 69.00
hist(cleaned_taxi_df$Tips, breaks = 50, main = "Histogram of Tips", xlab = "Tips")
summary(cleaned_taxi_df$Tips)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 0.000 0.000 1.235 2.220 9.250
hist(cleaned_taxi_df$Tolls, breaks = 50, main = "Histogram of Tolls", xlab = "Tolls")
summary(cleaned_taxi_df$Tolls)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 0 0 0 0 0
hist(cleaned_taxi_df$Extras, breaks = 50, main = "Histogram of Extras", xlab = "Extras")
summary(cleaned_taxi_df$Extras)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.1357 0.0000 2.5000
hist(cleaned_taxi_df$Trip.Total, breaks = 25, main = "Histogram of Trip.Total", xlab = "Trip.Total")
summary(cleaned_taxi_df$Trip.Total)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 8.40 12.00 16.33 23.25 75.50
hist(cleaned_taxi_df$Trip.Miles, breaks = 25, main = "Histogram of Trip.Miles", xlab = "Trip.Miles")
summary(cleaned_taxi_df$Trip.Miles)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 0.720 1.680 3.783 5.740 26.770
boxplot(cleaned_taxi_df[, c('Fare','Tips','Tolls','Extras','Trip.Total')])
boxplot(cleaned_taxi_df[, c('Trip.Miles')])
# Select only numeric columns for correlation
numeric_columns <- sapply(cleaned_taxi_df, is.numeric)
cor_matrix <- cor(cleaned_taxi_df[, numeric_columns], use = "complete.obs")
## Warning in cor(cleaned_taxi_df[, numeric_columns], use = "complete.obs"): the
## standard deviation is zero
print(cor_matrix)
## Trip.Miles Fare Tips Tolls Extras Trip.Total
## Trip.Miles 1.00000000 0.83320594 0.02935890 NA -0.08140904 0.81726238
## Fare 0.83320594 1.00000000 0.06029796 NA -0.10636693 0.98411311
## Tips 0.02935890 0.06029796 1.00000000 NA 0.05050705 0.23003575
## Tolls NA NA NA 1 NA NA
## Extras -0.08140904 -0.10636693 0.05050705 NA 1.00000000 -0.05773196
## Trip.Total 0.81726238 0.98411311 0.23003575 NA -0.05773196 1.00000000
## Trip.Minutes 0.70885089 0.75850279 0.01635369 NA -0.06630001 0.74146775
## Trip.Minutes
## Trip.Miles 0.70885089
## Fare 0.75850279
## Tips 0.01635369
## Tolls NA
## Extras -0.06630001
## Trip.Total 0.74146775
## Trip.Minutes 1.00000000
cor_data <- as.data.frame(as.table(cor_matrix))
ggplot(cor_data, aes(Var1, Var2, fill = Freq)) +
geom_tile() + # This creates the heatmap boxes
geom_text(aes(label = sprintf("%.2f", Freq)), vjust = 1, color = "black", size = 3) + # Adjust size as needed
scale_fill_gradient2(low = "blue", high = "red", mid = "white", midpoint = 0, limit = c(-1, 1)) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
ggtitle("Correlation Matrix Heatmap")
Trip.Total is in high correlation relationship with ‘Fare, Trip.Miles, Tips’, but almost no relationship with ‘Tolls, Hour of the trip’
# Generate a sequence of dates within the range of your data
date_range <- seq(min(cleaned_taxi_df$Trip.Start.Date), max(cleaned_taxi_df$Trip.Start.Date), by = "day")
# Create a data frame of weekend dates
weekend_dates <- data.frame(Date = date_range[weekdays(date_range) %in% c("Saturday", "Sunday")])
# Group by Taxi ID and Date, then summarize the average trips
average_trips_per_taxi <- cleaned_taxi_df %>%
group_by(Taxi.ID, Trip.Start.Date) %>%
summarise(Trips = n(), .groups = 'drop') %>%
group_by(Trip.Start.Date) %>%
summarise(AvgTrips = mean(Trips), .groups = 'drop')
# Plot the average trips per taxi over time
plot <- ggplot(average_trips_per_taxi, aes(x = Trip.Start.Date, y = AvgTrips)) +
geom_line() +
labs(title = "Average Number of Trips per Taxi in a Day Over Time",
x = "Date",
y = " ") +
theme_minimal()
# Add vertical lines for weekends
plot + geom_vline(data = weekend_dates, aes(xintercept = as.numeric(Date)), color = "red", linetype = "dashed")
label_thousands <- function(x) {
paste0(x / 1000, "k")
}
library(scales)
##
## Attaching package: 'scales'
## The following object is masked from 'package:purrr':
##
## discard
## The following object is masked from 'package:readr':
##
## col_factor
## The following objects are masked from 'package:formattable':
##
## comma, percent, scientific
# First, calculate the total trips per day
total_trips_over_time <- cleaned_taxi_df %>%
group_by(Trip.Start.Date) %>%
summarise(TotalTrips = n(), .groups = 'drop')
total_trips_plot <- ggplot(total_trips_over_time, aes(x = Trip.Start.Date, y = TotalTrips)) +
geom_line() +
scale_y_continuous(labels = label_thousands) +
labs(title = "Total Trips Over Time",
x = "Date",
y = " ") +
theme_minimal()
# Assuming weekend_dates is a vector that contains the dates of weekends
total_trips_plot <- total_trips_plot +
geom_vline(data = data.frame(Date = weekend_dates), aes(xintercept = as.numeric(Date)),
color = "red", linetype = "dashed")
print(total_trips_plot)
We can see that people tends not to use taxi on weekends than weekdays.
# Create a long format data frame for pickup and dropoff areas
area_data <- tidyr::pivot_longer(
cleaned_taxi_df,
cols = c("Pickup.Community.Area", "Dropoff.Community.Area"),
names_to = "AreaType",
values_to = "Area"
)
# Plot overlaid bar charts for pickup and dropoff area counts
ggplot(area_data, aes(x = as.factor(Area), fill = AreaType)) +
geom_bar(position = "identity", alpha = 0.5) + # Set alpha for transparency
scale_y_continuous(labels = label_thousands) + # Format y-axis labels
labs(title = "Overlaid Bar Chart of Pickup and Dropoff Area Counts",
x = "Area Code",
y = " ") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
scale_fill_manual(values = c("Pickup.Community.Area" = "blue", "Dropoff.Community.Area" = "red"))
# Define a custom label function for y axis to display in millions
label_millions <- function(x) {
paste0(formatC(x / 1e6, format = "f", digits = 1), "M")
}
# Summarise Trip.Total by Payment Type
trip_total_by_PaymentType <- cleaned_taxi_df %>%
group_by(Payment.Type) %>%
summarise(TotalRevenue = sum(Trip.Total), .groups = 'drop')
# Create a bar plot of Trip Total by Payment Type
ggplot(trip_total_by_PaymentType, aes(x = Payment.Type, y = TotalRevenue, fill = Payment.Type)) +
geom_col() + # This creates a bar chart with pre-summarized data
scale_y_continuous(labels = label_millions) + # Use the custom label function for millions
labs(title = "Taxi Fare by Payment Type",
x = " ",
y = " ") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1), legend.position = "none")
trip_total_by_DayoftheWeek <- cleaned_taxi_df %>%
group_by(Trip.Day.Of.The.Week) %>%
summarise(TotalRevenue = sum(Trip.Total), .groups = 'drop')
# Define a named vector to map day numbers to day names
day_names <- c("1" = "Sunday", "2" = "Monday", "3" = "Tuesday", "4" = "Wednesday",
"5" = "Thursday", "6" = "Friday", "7" = "Saturday")
# Create the bar plot, using the named vector for axis labels
ggplot(trip_total_by_DayoftheWeek, aes(x = Trip.Day.Of.The.Week, y = TotalRevenue, fill = Trip.Day.Of.The.Week)) +
geom_col() +
scale_x_discrete(labels = day_names) + # Use the day_names vector for axis labels
scale_y_continuous(labels = label_millions) +
labs(title = "Taxi fare by Day of the Week",
x = " ",
y = " ") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1), legend.position = "none")
trip_total_by_Company <- cleaned_taxi_df %>%
group_by(Company) %>%
summarise(TotalRevenue = sum(Trip.Total) / 1e6, .groups = 'drop') %>%
arrange(desc(TotalRevenue))
kable(trip_total_by_Company, digits = 2, col.names = c("Company", "Total Revenue (Millions $)"))
| Company | Total Revenue (Millions $) |
|---|---|
| Flash Cab | 1.51 |
| Taxi Affiliation Services | 0.86 |
| Sun Taxi | 0.42 |
| City Service | 0.42 |
| Taxicab Insurance Agency Llc | 0.37 |
| Chicago Independents | 0.20 |
| 5 Star Taxi | 0.19 |
| Globe Taxi | 0.16 |
| Blue Ribbon Taxi Association | 0.15 |
| Medallion Leasin | 0.12 |
| Choice Taxi Association | 0.05 |
| Taxicab Insurance Agency, LLC | 0.04 |
| Choice Taxi Association Inc | 0.04 |
| Chicago City Taxi Association | 0.03 |
| U Taxicab | 0.02 |
| Top Cab | 0.01 |
| Koam Taxi Association | 0.01 |
| Taxi Affiliation Services Llc - Yell | 0.01 |
| Patriot Taxi Dba Peace Taxi Associat | 0.01 |
| Star North Taxi Management Llc | 0.01 |
| Chicago Taxicab | 0.01 |
| Metro Jet Taxi A. | 0.00 |
| 3591 - 63480 Chuks Cab | 0.00 |
| Tac - Yellow Cab Association | 0.00 |
| Setare Inc | 0.00 |
| 3556 - 36214 RC Andrews Cab | 0.00 |
| 5167 - 71969 5167 Taxi Inc | 0.00 |
| Tac - Checker Cab Dispatch | 0.00 |
| 312 Medallion Management Corp | 0.00 |
| 6574 - Babylon Express Inc. | 0.00 |
| Petani Cab Corp | 0.00 |
| 2733 - 74600 Benny Jona | 0.00 |
| 4053 - 40193 Adwar H. Nikola | 0.00 |
# Summarise Trip.Total by Company and arrange by TotalRevenue in descending order
trip_total_by_time <- cleaned_taxi_df %>%
group_by(Trip.Hour.Of.The.Day) %>%
summarise(TotalRevenue = sum(Trip.Total), .groups = 'drop') %>%
arrange(desc(TotalRevenue))
ggplot(trip_total_by_time, aes(x = Trip.Hour.Of.The.Day, y = TotalRevenue, fill = TotalRevenue)) +
geom_col() +
scale_y_continuous(labels = label_thousands) +
labs(title = "Taxi fare by time in a day",
x = "Time",
y = " ") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1),
legend.position = "none") +
scale_fill_gradient(low = "lightblue", high = "blue")
# Summarise Trip id by Payment Type
trip_count_by_PaymentType <- cleaned_taxi_df %>%
group_by(Payment.Type) %>%
summarise(TripCount = n_distinct(Trip.ID))
# Create a bar plot of Trip Total by Payment Type
ggplot(trip_count_by_PaymentType, aes(x = Payment.Type, y = TripCount, fill = Payment.Type)) +
geom_col() +
scale_y_continuous(labels = label_thousands) +
labs(title = "Trips Count by Paymeny type",
x = " ",
y = " ") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1), legend.position = "none")
trip_count_by_DayoftheWeek <- cleaned_taxi_df %>%
group_by(Trip.Day.Of.The.Week) %>%
summarise(TripCount = n_distinct(Trip.ID))
# Define a named vector to map day numbers to day names
day_names <- c("1" = "Sunday", "2" = "Monday", "3" = "Tuesday", "4" = "Wednesday",
"5" = "Thursday", "6" = "Friday", "7" = "Saturday")
# Create the bar plot, using the named vector for axis labels
ggplot(trip_count_by_DayoftheWeek, aes(x = Trip.Day.Of.The.Week, y = TripCount, fill = Trip.Day.Of.The.Week)) +
geom_col() +
scale_x_discrete(labels = day_names) +
scale_y_continuous(labels = label_thousands) +
labs(title = "Trips Count by Day of the Week",
x = " ",
y = " ") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1), legend.position = "none")
# Summarise Trip.Total by Company and arrange by TotalRevenue in descending order
trip_count_by_Company <- cleaned_taxi_df %>%
group_by(Company) %>%
summarise(TripCount = n_distinct(Trip.ID) / 1e3 , .groups = 'drop') %>%
arrange(desc(TripCount))
# Display the full table sorted by Total Revenue
kable(trip_count_by_Company, digits=2, col.names = c("Company", "Trips Count (Thousands $)"))
| Company | Trips Count (Thousands $) |
|---|---|
| Flash Cab | 75.98 |
| Taxi Affiliation Services | 53.39 |
| Sun Taxi | 28.14 |
| City Service | 27.04 |
| Taxicab Insurance Agency Llc | 26.32 |
| Chicago Independents | 14.53 |
| 5 Star Taxi | 11.10 |
| Globe Taxi | 10.76 |
| Blue Ribbon Taxi Association | 10.47 |
| Medallion Leasin | 8.28 |
| Taxicab Insurance Agency, LLC | 3.88 |
| Choice Taxi Association | 3.08 |
| Chicago City Taxi Association | 3.06 |
| Choice Taxi Association Inc | 2.38 |
| U Taxicab | 1.64 |
| Top Cab | 1.26 |
| Koam Taxi Association | 1.03 |
| Taxi Affiliation Services Llc - Yell | 0.73 |
| Patriot Taxi Dba Peace Taxi Associat | 0.69 |
| Star North Taxi Management Llc | 0.44 |
| Chicago Taxicab | 0.41 |
| 3591 - 63480 Chuks Cab | 0.32 |
| Setare Inc | 0.25 |
| Metro Jet Taxi A. | 0.24 |
| Tac - Yellow Cab Association | 0.18 |
| 3556 - 36214 RC Andrews Cab | 0.09 |
| 5167 - 71969 5167 Taxi Inc | 0.09 |
| 312 Medallion Management Corp | 0.07 |
| Tac - Checker Cab Dispatch | 0.06 |
| 6574 - Babylon Express Inc. | 0.03 |
| Petani Cab Corp | 0.00 |
| 2733 - 74600 Benny Jona | 0.00 |
| 4053 - 40193 Adwar H. Nikola | 0.00 |
# Summarise Trip.Total by Company and arrange by TotalRevenue in descending order
trip_count_by_time <- cleaned_taxi_df %>%
group_by(Trip.Hour.Of.The.Day) %>%
summarise(TripCount = n_distinct(Trip.ID), .groups = 'drop') %>%
arrange(desc(TripCount))
ggplot(trip_count_by_time, aes(x = Trip.Hour.Of.The.Day, y = TripCount, fill = TripCount)) +
geom_col() +
scale_y_continuous(labels = label_thousands) +
labs(title = "Trips Count by time in a day",
x = "Time",
y = " ") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1),
legend.position = "none") +
scale_fill_gradient(low = "lightblue", high = "blue")
detach(cleaned_taxi_df)
saveRDS(cleaned_taxi_df, "taxi_df_EDA.rds")
Some Key Findings from EDA
Total taxi fare correlates with original fare, trip miles, tips, where does not correlates with tolls and trip hours of the day.
People tends to use taxi on weekdays (especially on Wed > Tue > Mon), but not on weekends.
Majority of people use Credit Card then Cash to pay the taxi fare.
The most popular Pickup area are ‘8, 76, 32, 28’.
The most popular Dropoff area are ‘8, 32, 28’.
Top 5 most frequently used taxi companies in Chicago are Flash Cab, Taxi Affiliation Services, Sun Taxi, Taxicab Insurance Agency Llc, and City Service.
The peak time for using taxi is 4pm~5pm. But by the taxi fare, the peak time is 3pm~7pm.
cleaned_taxi_df <- readRDS("taxi_df_EDA.rds")
features_to_drop <- c('Trip.ID', 'Taxi.ID', 'Trip.Start.Timestamp', 'Trip.Start.Date', 'Payment.Type', 'Company', 'Tips', 'Tolls', 'Extras', 'Trip.Total')
selected_taxi_df <- cleaned_taxi_df %>% select(-one_of(features_to_drop))
factor_columns <- sapply(selected_taxi_df, is.factor)
selected_taxi_df[factor_columns] <- sapply(selected_taxi_df[factor_columns], function(x) as.numeric(levels(x))[x])
head(selected_taxi_df)
set.seed(123)
sample_size <- floor(0.5 * nrow(selected_taxi_df))
sample_indices <- sample(seq_len(nrow(selected_taxi_df)), size=sample_size)
random_forest <- randomForest(Fare ~ ., data = selected_taxi_df[sample_indices, ], importance = TRUE, ntree = 25)
importance(random_forest)
## %IncMSE IncNodePurity
## Trip.Miles 28.697780 7523341.9
## Pickup.Community.Area 10.293868 1318231.5
## Dropoff.Community.Area 11.591969 1421921.2
## Trip.Minutes 26.346702 5572350.4
## Trip.Hour.Of.The.Day 22.412510 440651.3
## Trip.Day.Of.The.Week 3.587485 210386.1
regfit.full <- regsubsets(Fare ~ ., data = selected_taxi_df)
reg.summary <- summary(regfit.full)
print(reg.summary)
## Subset selection object
## Call: regsubsets.formula(Fare ~ ., data = selected_taxi_df)
## 6 Variables (and intercept)
## Forced in Forced out
## Trip.Miles FALSE FALSE
## Pickup.Community.Area FALSE FALSE
## Dropoff.Community.Area FALSE FALSE
## Trip.Minutes FALSE FALSE
## Trip.Hour.Of.The.Day FALSE FALSE
## Trip.Day.Of.The.Week FALSE FALSE
## 1 subsets of each size up to 6
## Selection Algorithm: exhaustive
## Trip.Miles Pickup.Community.Area Dropoff.Community.Area Trip.Minutes
## 1 ( 1 ) "*" " " " " " "
## 2 ( 1 ) "*" " " " " "*"
## 3 ( 1 ) "*" " " "*" "*"
## 4 ( 1 ) "*" "*" "*" "*"
## 5 ( 1 ) "*" "*" "*" "*"
## 6 ( 1 ) "*" "*" "*" "*"
## Trip.Hour.Of.The.Day Trip.Day.Of.The.Week
## 1 ( 1 ) " " " "
## 2 ( 1 ) " " " "
## 3 ( 1 ) " " " "
## 4 ( 1 ) " " " "
## 5 ( 1 ) " " "*"
## 6 ( 1 ) "*" "*"
highest_point <- which.max(reg.summary$adjr2)
plot(reg.summary$adjr2, xlab = "Number of Variables",
ylab = "Adjusted RSq", type = "l")
points(highest_point, reg.summary$adjr2[highest_point], col = "red", cex = 2,
pch = 20)
plot(reg.summary$cp, xlab = "Number of Variables", ylab = "Cp", type = "l")
lowest_point <- which.min(reg.summary$cp)
points(lowest_point, reg.summary$cp[lowest_point], col = "red", cex = 2, pch = 20)
As we see from the Adjusted R2 and Cp graphs, we would have the best performance on using all the 6 features. The individual importance of these features was calculated using the random forests where we found that Trip.Miles & Trip.Hour.Of.The.Day having the highest % Increase in MSE if not included while Trip.Miles & Trip.Minutes contributed the most to the increases in Node purities of the decision trees when used to split.
set.seed(123)
trainIndex <- createDataPartition(selected_taxi_df$Fare, p = 0.6, list = FALSE)
train_set <- selected_taxi_df[trainIndex,]
test_set <- selected_taxi_df[-trainIndex,]
cat("Training:", dim(train_set), "Testing:", dim(test_set))
## Training: 171577 7 Testing: 114381 7
x_train <- model.matrix(Fare ~ .-1, data = train_set)
y_train <- train_set$Fare
x_test <- model.matrix(Fare ~ .-1, data = test_set)
y_test <- test_set$Fare
grid <- 10^seq(10, -2, length = 100)
cv_ridge_fit <- cv.glmnet(x_train, y_train, alpha = 0, lambda = grid)
cat("Min value for lambda(Ridge):", cv_ridge_fit$lambda.min)
## Min value for lambda(Ridge): 0.01
plot(cv_ridge_fit)
title("Cross-Validation Lambda value for Ridge", line = 2.5)
grid <- 10^seq(10, -2, length = 100)
cv_lasso_fit <- cv.glmnet(x_train, y_train, alpha = 1, lambda = grid)
cat("Min value for lambda(lasso):", cv_lasso_fit$lambda.min)
## Min value for lambda(lasso): 0.01
plot(cv_lasso_fit)
title("Cross-Validation Lambda value for lasso", line = 2.5)
lasso_model <- glmnet(x_train, y_train, alpha = 1, lambda = cv_lasso_fit$lambda.min)
ridge_model <- glmnet(x_train, y_train, alpha = 0, lambda = cv_ridge_fit$lambda.min)
predicted_fare_lasso <- predict(lasso_model, newx = x_test)
predicted_fare_ridge <- predict(ridge_model, newx = x_test)
mse_lasso <- mean((y_test - predicted_fare_lasso)^2)
print(paste("Lasso Model - Mean Squared Error (MSE):", mse_lasso))
## [1] "Lasso Model - Mean Squared Error (MSE): 29.1224046335354"
mse_ridge <- mean((y_test - predicted_fare_ridge)^2)
print(paste("Ridge Model - Mean Squared Error (MSE):", mse_ridge))
## [1] "Ridge Model - Mean Squared Error (MSE): 29.1226404014018"
mae_lasso <- mean(abs(y_test - predicted_fare_lasso))
print(paste("Lasso Model - Mean Absolute Error (mae):", mae_lasso))
## [1] "Lasso Model - Mean Absolute Error (mae): 2.80794126799152"
mae_ridge <- mean(abs(y_test - predicted_fare_ridge))
print(paste("Ridge Model - Mean Absolute Error (mae):", mae_ridge))
## [1] "Ridge Model - Mean Absolute Error (mae): 2.80851565885102"
df_lasso <- data.frame(Actual = y_test, Prediction = c(predicted_fare_lasso))
ggplot(df_lasso, aes(x = Actual, y = Prediction)) +
geom_point() +
geom_abline(intercept = 0, slope = 1, color = "red") +
ggtitle("Lasso: Predicted vs Actual Values") +
xlab("Actual Values") +
ylab("Predicted Values")
df_ridge <- data.frame(Actual = y_test, Prediction = c(predicted_fare_ridge))
ggplot(df_ridge, aes(x = Actual, y = Prediction)) +
geom_point() +
geom_abline(intercept = 0, slope = 1, color = "red") +
ggtitle("Ridge: Predicted vs Actual Values") +
xlab("Actual Values") +
ylab("Predicted Values")
smooth_spline <- lm(Fare ~ bs(Trip.Miles) + bs(Pickup.Community.Area) + bs(Dropoff.Community.Area) + bs(Trip.Minutes) + bs(Trip.Hour.Of.The.Day) + bs(Trip.Day.Of.The.Week), data = train_set)
summary(smooth_spline)
##
## Call:
## lm(formula = Fare ~ bs(Trip.Miles) + bs(Pickup.Community.Area) +
## bs(Dropoff.Community.Area) + bs(Trip.Minutes) + bs(Trip.Hour.Of.The.Day) +
## bs(Trip.Day.Of.The.Week), data = train_set)
##
## Residuals:
## Min 1Q Median 3Q Max
## -53.287 -1.955 -0.946 0.800 62.650
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 10.59614 0.10628 99.702 < 2e-16 ***
## bs(Trip.Miles)1 -8.40632 0.17624 -47.699 < 2e-16 ***
## bs(Trip.Miles)2 42.10012 0.29904 140.784 < 2e-16 ***
## bs(Trip.Miles)3 25.07331 0.44035 56.939 < 2e-16 ***
## bs(Pickup.Community.Area)1 -4.86976 0.17587 -27.690 < 2e-16 ***
## bs(Pickup.Community.Area)2 4.35223 0.15593 27.912 < 2e-16 ***
## bs(Pickup.Community.Area)3 0.38478 0.08298 4.637 3.54e-06 ***
## bs(Dropoff.Community.Area)1 -3.22463 0.16531 -19.506 < 2e-16 ***
## bs(Dropoff.Community.Area)2 3.62377 0.14404 25.158 < 2e-16 ***
## bs(Dropoff.Community.Area)3 0.78060 0.07482 10.433 < 2e-16 ***
## bs(Trip.Minutes)1 7.00863 0.15853 44.209 < 2e-16 ***
## bs(Trip.Minutes)2 25.38679 0.17381 146.060 < 2e-16 ***
## bs(Trip.Minutes)3 13.28938 0.17015 78.102 < 2e-16 ***
## bs(Trip.Hour.Of.The.Day)1 -2.73170 0.17948 -15.220 < 2e-16 ***
## bs(Trip.Hour.Of.The.Day)2 -5.33725 0.10511 -50.778 < 2e-16 ***
## bs(Trip.Hour.Of.The.Day)3 -0.89259 0.10933 -8.164 3.25e-16 ***
## bs(Trip.Day.Of.The.Week)1 -1.54938 0.11248 -13.775 < 2e-16 ***
## bs(Trip.Day.Of.The.Week)2 -1.29344 0.07933 -16.304 < 2e-16 ***
## bs(Trip.Day.Of.The.Week)3 -0.80240 0.05797 -13.840 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.999 on 171558 degrees of freedom
## Multiple R-squared: 0.7937, Adjusted R-squared: 0.7937
## F-statistic: 3.667e+04 on 18 and 171558 DF, p-value: < 2.2e-16
predicted_fare <- predict(smooth_spline, newdata = test_set)
mse <- mean((test_set$Fare - predicted_fare)^2)
print(paste("Mean Squared Error (MSE):", mse))
## [1] "Mean Squared Error (MSE): 24.8368407582032"
mae <- mean(abs(test_set$Fare - predicted_fare))
print(paste("Mean Absolute Error (MAE):", mae))
## [1] "Mean Absolute Error (MAE): 2.59340833446686"
rmse <- sqrt(mse)
print(paste("Root Mean Squared Error (RMSE):", rmse))
## [1] "Root Mean Squared Error (RMSE): 4.98365736765713"
train_predicted_fare <- predict(smooth_spline, newdata = train_set)
residuals <- residuals(smooth_spline)
plot_data <- data.frame(Predicted = train_predicted_fare, Residuals = residuals)
ggplot(plot_data, aes(x = Predicted, y = Residuals)) +
geom_point() +
geom_hline(yintercept = 0, linetype = "dashed", color = "red") +
theme_minimal() +
labs(title = "Residual Plot", x = "Predicted Fare", y = "Residuals")
df_splines <- data.frame(Actual = test_set$Fare, Predicted = predicted_fare)
ggplot(df_splines, aes(x = Actual, y = Predicted)) +
geom_point() +
geom_abline(intercept = 0, slope = 1, color = "red") +
ggtitle("Smoothing Splines: Predicted vs Actual Values") +
xlab("Actual Values") +
ylab("Predicted Values")
As we see from the test set results, smoothing splines with an error of (+,-)$3.5 perform better than the linear models of lasso & ridge with an error of (+,-)$3.9. But the residual analysis of the smoothing splines show that the residuals might be following a particular pattern as they all fall in a certain region.
features_to_drop <- c('Trip.ID', 'Taxi.ID', 'Trip.Start.Timestamp', 'Trip.Start.Date', 'Payment.Type', 'Company', 'Tolls', 'Extras', 'Trip.Total')
selected_taxi_df <- cleaned_taxi_df %>% select(-one_of(features_to_drop))
factor_columns <- sapply(selected_taxi_df, is.factor)
selected_taxi_df[factor_columns] <- sapply(selected_taxi_df[factor_columns], function(x) as.numeric(levels(x))[x])
summary(selected_taxi_df$Tips)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 0.000 0.000 1.235 2.220 9.250
model <- lm(Tips ~ ., data = selected_taxi_df)
summary(model)
##
## Call:
## lm(formula = Tips ~ ., data = selected_taxi_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.2715 -1.3258 -0.9724 0.9940 8.7408
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.3111689 0.0139370 94.078 < 2e-16 ***
## Trip.Miles -0.0211006 0.0013904 -15.176 < 2e-16 ***
## Pickup.Community.Area -0.0150702 0.0001931 -78.042 < 2e-16 ***
## Dropoff.Community.Area 0.0029834 0.0001862 16.019 < 2e-16 ***
## Fare 0.0287299 0.0006143 46.769 < 2e-16 ***
## Trip.Minutes -0.0123958 0.0005148 -24.079 < 2e-16 ***
## Trip.Hour.Of.The.Day 0.0019003 0.0006887 2.759 0.00579 **
## Trip.Day.Of.The.Week 0.0038429 0.0019172 2.004 0.04503 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.774 on 285950 degrees of freedom
## Multiple R-squared: 0.02729, Adjusted R-squared: 0.02727
## F-statistic: 1146 on 7 and 285950 DF, p-value: < 2.2e-16
# Plot for Trip.Miles vs Tips
ggplot(selected_taxi_df, aes(x = Trip.Miles, y = Tips)) +
geom_point() +
geom_smooth(method = "lm", formula = y ~ x, se = FALSE) + # Linear regression line
labs(title = "Trip Miles vs Tips", x = "Trip Miles", y = "Tips")
# Plot for Pickup.Community.Area vs Tips
ggplot(selected_taxi_df, aes(x = Pickup.Community.Area, y = Tips)) +
geom_point() +
geom_smooth(method = "lm", formula = y ~ x, se = FALSE) +
labs(title = "Pickup Community Area vs Tips", x = "Pickup Community Area", y = "Tips")
# Plot for Dropoff.Community.Area vs Tips
ggplot(selected_taxi_df, aes(x = Dropoff.Community.Area, y = Tips)) +
geom_point() +
geom_smooth(method = "lm", formula = y ~ x, se = FALSE) +
labs(title = "Dropoff Community Area vs Tips", x = "Dropoff Community Area", y = "Tips")
# Plot for Fare vs Tips
ggplot(selected_taxi_df, aes(x = Fare, y = Tips)) +
geom_point() +
geom_smooth(method = "lm", formula = y ~ x, se = FALSE) +
labs(title = "Fare vs Tips", x = "Fare", y = "Tips")
# Plot for Trip.Minutes vs Tips
ggplot(selected_taxi_df, aes(x = Trip.Minutes, y = Tips)) +
geom_point() +
geom_smooth(method = "lm",formula = y ~ x, se = FALSE) +
labs(title = "Trip Minutes vs Tips", x = "Trip Minutes", y = "Tips")
# Plot for Trip.Hour.Of.The.Day vs Tips
ggplot(selected_taxi_df, aes(x = Trip.Hour.Of.The.Day, y = Tips)) +
geom_point() +
geom_smooth(method = "lm",formula = y ~ x, se = FALSE) +
labs(title = "Trip Hour Of The Day vs Tips", x = "Trip Hour Of The Day", y = "Tips")
# Plot for Trip.Day.Of.The.Week vs Tips
ggplot(selected_taxi_df, aes(x = Trip.Day.Of.The.Week, y = Tips)) +
geom_point() +
geom_smooth(method = "lm", formula = y ~ x, se = FALSE) +
labs(title = "Trip Day Of The Week vs Tips", x = "Trip Day Of The Week", y = "Tips")